K-State Honor Code:
"On my honor, as a student, I have neither given nor received unauthorized aid on this academic work." A grade of XF can result from a breach of academic honesty.
Your name: Mohammad Najjartabar Bisheh
from IPython.display import Image
from IPython.core.display import HTML
Image(url= "https://i.pinimg.com/originals/db/4f/88/db4f88f155d22599f59765e14f4c5497.jpg")
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# import the scatter_matrix functionality
from pandas.plotting import scatter_matrix
import plotly.graph_objects as go
import plotly.express as px
import pingouin as pg
import statsmodels.api as sm
from statsmodels.formula.api import ols
#regression packages
import sklearn.linear_model as lm
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
#lasso regression
from sklearn import linear_model
#f_regression (feature selection)
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest
# recursive feature selection (feature selection)
from sklearn.feature_selection import RFE
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances
# Classifiers
#import decisiontreeclassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
#import logisticregression classifier
from sklearn.linear_model import LogisticRegression
#import knn classifier
from sklearn.neighbors import KNeighborsClassifier
#for validating your classification model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
# feature selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import chi2
# grid search
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# Reading dataset
df = pd.read_csv('movie_metadata.csv/movie_metadata.csv')  # forward slash keeps the path portable
df.head()
df.info()
# Describe data
df.describe()
(William McKnight, in Information Management, 2014)
Sources of Poor Data Quality. The following are seven sources of data quality issues (a short pandas sketch of two of these checks follows the list):
1. Entry quality: Did the information enter the system correctly at the origin?
2. Process quality: Proper checks and quality control at each touchpoint along the path can help ensure that problems are rooted out, but these checks are often absent in legacy processes.
3. Identification quality: Data quality processes can largely eliminate this problem by matching records, identifying duplicates, and placing a confidence score on the similarity of records.
4. Integration quality: Is all the known information about an object integrated to the point of providing an accurate representation of the object?
5. Usage quality: Is the information used and interpreted correctly at the point of access?
6. Aging quality: Has enough time passed that the validity of the information can no longer be trusted?
7. Organizational quality: The biggest challenge to reconciliation is getting the various departments to agree that their A equals the other’s B equals the other’s C plus D.
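To make two of these concrete on the movie dataset, here is a minimal sketch of an entry-quality and an identification-quality check; the thresholds and the deduplication key are my own assumptions, not from McKnight.
# Hypothetical entry-quality check (assumed thresholds): values that could not
# have been entered correctly, e.g. scores outside the 1-10 IMDB scale or negative budgets.
bad_entry = df[(df['imdb_score'] < 1) | (df['imdb_score'] > 10) | (df['budget'] < 0)]
print("suspect entries:", len(bad_entry))
# Hypothetical identification-quality check: records that likely describe the
# same movie (same title and year) and would need matching/deduplication.
dupes = df[df.duplicated(subset=['movie_title', 'title_year'], keep=False)]
print("possible duplicate identities:", len(dupes))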
#Finding and deleting duplicated values
len(df[df.duplicated()])
len(df)
# Finding and taking care of missing values
df.isnull().sum()
# 0 is sometimes used as a placeholder for a missing value.
# Flag rows where every one of these columns equals 0.
zero_cols = ['budget', 'gross', 'color', 'director_name', 'num_critic_for_reviews',
             'duration', 'director_facebook_likes', 'actor_3_facebook_likes',
             'actor_2_name', 'actor_1_facebook_likes', 'genres', 'actor_1_name',
             'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
             'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
             'movie_imdb_link', 'num_user_for_reviews', 'language',
             'content_rating', 'title_year', 'actor_2_facebook_likes',
             'imdb_score', 'movie_facebook_likes']
criteria = (df[zero_cols] == 0).all(axis=1)
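The mask above is only constructed; as a usage sketch (my addition, not applied in the analysis that follows), the flagged rows could be inspected or dropped:
# inspect rows where every listed column is zero
df[criteria]
# dropping them would be: df = df[~criteria]  (left unapplied here)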
# Some missing values are marked with the placeholder “” or 0. I replace them with "na".
df['content_rating'] = df['content_rating'].str.replace('“”', 'na')
df['content_rating'] = df['content_rating'].str.replace('0', 'na')
missing = df.isna()
missing2 = missing.sum()
missing2 = missing2.reset_index(level=0)
missing2 = missing2.rename(columns={'index': 'factor', 0: 'total'})
missing2 = missing2.set_index('factor')
missing2.sort_values('total', ascending=False).plot(kind='bar')
# For any analysis that involves gross, budget, aspect_ratio, or content_rating, I will use dff, which is df without NA values.
missing['actor_1_name'].describe()
df['gross'].describe()
df['director_name'].value_counts().head(10)
df['actor_1_name'].value_counts().head(10)
df['budget'].value_counts().head()
df = df.drop_duplicates(keep='first')
len(df)
df = df.dropna(how='all')
df = df.dropna(subset=['imdb_score'])
df.info()
# To have a consistent, complete dataset I made dff:
dff = df.dropna()
dff.info()
b = pd.DataFrame(dff.genres.str.split('|').tolist(), index=dff.imdb_score).stack()
b = b.reset_index()[[0, 'imdb_score']] # genres variable is currently labeled 0
b.columns = ['genres', 'imdb_score'] # renaming genres
b.head()
# just a different way
df2 = dff.join(dff.pop('genres').str.get_dummies('|'))
df2.head()
b['genres'].value_counts().plot(kind='bar')
b.groupby('genres')['imdb_score'].mean().plot.bar()
dff['content_rating'].value_counts()
#First method
# collapse the obsolete ratings GP, M, Passed, and NC-17 into 'R'
dff_replace = dff.replace({'content_rating': {'GP': 'R', 'M': 'R', 'Passed': 'R', 'NC-17': 'R'}})
dff_replace['content_rating'].value_counts()
#second method
# create a function
def f(x):
    # keep the common ratings; map everything else (GP, M, Passed, NC-17, ...) to 'R'
    keep = {'R', 'PG-13', 'PG', 'G', 'Not Rated', 'Unrated', 'Approved', 'X'}
    return x if x in keep else 'R'
dff_replace['content_rating'] = dff['content_rating'].apply(f)
dff_replace['content_rating'].value_counts()
dff['profit'] = dff['gross'] - dff['budget']
dff.info()
dff['return_on_investment_perc'] = (dff['profit'] / dff['budget']) * 100
dff.info()
#dff = dff.set_index('movie_title');
dff['profit'].head()
best_profit = dff.pivot_table(index='movie_title', aggfunc='sum', fill_value=0).sort_values(by=['profit'], ascending=False).head(20)
best_profit.head()
plt.figure(figsize=[16,16])
pieprofit = dff[['movie_title','profit','budget','return_on_investment_perc', 'gross','imdb_score']];
pieprofit = pieprofit.sort_values(by=['profit'], ascending = False).head(10);
explode = (0.2, 0, 0, 0, 0.1, 0, 0, 0.15, 0, 0) # explode the 1st, 5th, and 8th slices
colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue','maroon', 'aqua', 'khaki', 'darkturquoise', 'hotpink', 'mediumpurple']
plt.pie(pieprofit['profit'], labels=pieprofit['movie_title'], explode=explode, colors=colors, autopct='%1.1f%%', shadow=True, startangle=140)
plt.axis('equal')
dff['color'].value_counts().head()
plt.figure(figsize=[16,16])
piereturn = dff[['movie_title','profit','budget','return_on_investment_perc', 'gross','imdb_score']];
piereturn = piereturn.sort_values(by=['return_on_investment_perc'], ascending = False).head(10);
explode = (0.2, 0, 0, 0, 0.1, 0, 0, 0.15, 0, 0) # explode the 1st, 5th, and 8th slices
colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue','maroon', 'aqua', 'khaki', 'darkturquoise', 'hotpink', 'mediumpurple']
plt.pie(piereturn['return_on_investment_perc'], labels=piereturn['movie_title'], explode=explode, colors=colors, autopct='%1.1f%%', shadow=True, startangle=140)
plt.axis('equal')
piereturn = dff[['movie_title','profit','budget','return_on_investment_perc', 'gross','imdb_score']];
piereturn = piereturn.sort_values(by=['return_on_investment_perc'], ascending = False).head(20)
#px.scatter(piereturn, x="budget", y="return_on_investment_perc", text ='movie_title' , trendline="lowess")
piereturn
#let's get some general information from language and country
dff['language'].value_counts().head(16)
dff['country'].value_counts().head(20)
e = dff.loc[dff['country'] == 'Iran']
e.groupby(['movie_title','imdb_score'])['profit'].sum().sort_values(ascending=False).head()
e = dff.loc[dff['language'] == 'Persian']
e.groupby(['movie_title','imdb_score'])['profit'].sum().sort_values(ascending=False).head()
# I want to draw a Pareto chart
def pareto_plot(df, x=None, y=None, title=None, show_pct_y=False, pct_format='{0:.0%}'):
    xlabel = x
    ylabel = y
    tmp = df.sort_values(y, ascending=False)
    x = tmp[x].values
    y = tmp[y].values
    weights = y / y.sum()
    cumsum = weights.cumsum()

    fig, ax1 = plt.subplots()
    ax1.bar(x, y)
    ax1.set_xlabel(xlabel)
    ax1.set_ylabel(ylabel)

    ax2 = ax1.twinx()
    ax2.plot(x, cumsum, '-ro', alpha=0.5)
    ax2.set_ylabel('', color='r')
    ax2.tick_params('y', colors='r')

    vals = ax2.get_yticks()
    ax2.set_yticklabels(['{:,.2%}'.format(v) for v in vals])

    # hide y-labels on right side
    if not show_pct_y:
        ax2.set_yticks([])

    formatted_weights = [pct_format.format(v) for v in cumsum]
    for i, txt in enumerate(formatted_weights):
        ax2.annotate(txt, (x[i], cumsum[i]), fontweight='heavy')

    if title:
        plt.title(title)

    plt.tight_layout()
    plt.show()
pareto = dff.sort_values(by=['budget'], ascending=False).head(8)
pareto_plot(pareto, x='movie_title', y='gross', title='gross Pareto for the top-8 budget movies')
dff.groupby(['movie_title','imdb_score'])['profit'].sum().sort_values(ascending=False).head(10)
dff.groupby(['movie_title','profit'])['imdb_score'].sum().sort_values(ascending=False).head(10)
fig = px.scatter(dff, x="imdb_score", y="gross")
fig.add_trace(
go.Scatter(
x=[8, 8],
y=[0, 800000000],
mode="lines",
line=go.scatter.Line(color="gray"),
showlegend=False)
)
fig.add_trace(
go.Scatter(
x=[1, 10],
y=[500000000, 500000000],
mode="lines",
line=go.scatter.Line(color="gray"),
showlegend=False)
)
fig.show()
fig = px.scatter(pieprofit, x="imdb_score", y="gross", text ='movie_title')
fig.add_trace(
go.Scatter(
x=[8, 8],
y=[50000000, 800000000],
mode="lines",
line=go.scatter.Line(color="gray"),
showlegend=False)
)
fig.add_trace(
go.Scatter(
x=[6, 9],
y=[500000000, 500000000],
mode="lines",
line=go.scatter.Line(color="gray"),
showlegend=False)
)
fig.show()
df1 = dff.copy()  # copy so the binned column does not silently mutate dff
# setting my own values for bins
df1['imdbscores_bins'] = pd.cut(df1['imdb_score'], bins=[0, 2, 4, 6, 8, 10], labels=['0-1.99', '2-3.99', '4-5.99', '6-7.99', '8-10'],
include_lowest=True)
# see the result
df1.head()
# relabel the bins: ['trash', 'tolerable', 'acceptable', 'incredible', 'unbelievable']
df1['imdbscores_bins'] = df1['imdbscores_bins'].cat.rename_categories({
    '0-1.99': 'trash', '2-3.99': 'tolerable', '4-5.99': 'acceptable',
    '6-7.99': 'incredible', '8-10': 'unbelievable'})
df1.head()
fig = px.scatter_3d(dff, x='return_on_investment_perc', y='country', z='imdb_score')
fig.show()
fig = px.scatter_3d(df1, x='return_on_investment_perc', y='country', z='imdbscores_bins')
fig.show()
fig = px.scatter_3d(pieprofit, x='profit', y='return_on_investment_perc', z='imdb_score', text = 'movie_title')
fig.show()
fig = px.scatter(dff, x="country", y="imdb_score", marginal_y="rug", marginal_x="histogram")
fig.show()
fig = px.scatter(dff, x="gross", y="imdb_score", marginal_y="violin",
marginal_x="box", trendline="ols")
fig.show()
plt.figure(figsize=[24,16])
fig = px.scatter_matrix(dff, dimensions=["imdb_score", "actor_1_facebook_likes", "actor_2_facebook_likes"])
fig.show()
df1['imdbscores_bins'].value_counts().head()
fig = px.parallel_coordinates(dff, color="imdb_score", labels={"actor_1_facebook_likes": "Actor 1 likes",
"actor_2_facebook_likes": "Actor 2 likes", "actor_3_facebook_likes": "Actor 3 likes",
"director_facebook_likes": "Director likes", },
color_continuous_scale=px.colors.diverging.Tealrose, color_continuous_midpoint=2)
fig.show()
earth = dff.groupby(['country', 'title_year'])['gross'].sum().sort_values()
earth.head()
earth2=dff.pivot_table(index=['country','title_year'], values='gross',
aggfunc='sum', fill_value=0, margins=True).reset_index()
#earth2=earth2['country'].value_counts()
earth2.head()
fig = px.choropleth(earth2, locations="country", locationmode="country names", color="gross", hover_name="country", animation_frame="title_year", range_color=[0, 999999999])
fig.show()
#dff.groupby('imdb_score').hist(figsize=(10,10));
df1.groupby('imdbscores_bins')['actor_1_facebook_likes'].sum().plot.bar();
df1.groupby('imdbscores_bins')['actor_2_facebook_likes'].sum().plot.bar();
df1.groupby('imdbscores_bins')['actor_3_facebook_likes'].sum().plot.bar();
sns.lmplot(x="actor_1_facebook_likes", y="director_facebook_likes", data=df1, hue="imdbscores_bins", x_jitter=.15, height=8)
#ff.groupby('imdbscores_bins')['duration'].sum().plot.bar();
sns.violinplot(x="duration", y="imdbscores_bins", data=df1,
               palette=["lightblue", "lightpink"])
#dff.groupby('imdbscores_bins')['num_critic_for_reviews'].sum().plot.bar();
sns.violinplot(x="num_critic_for_reviews", y="imdbscores_bins", data=df1,
               palette=["lightblue", "lightpink"])
dffcorr = dff.select_dtypes('number').corr()  # restrict to numeric columns
dffcorr
dffcorr ['imdb_score']
plt.figure(figsize=(14,10))
sns.heatmap(dffcorr, vmax=.8, square=True, annot=True, fmt=".1f")
sns.jointplot(x="imdb_score", y="duration", data=dff, kind="hex", color="#8855AA")
sns.jointplot(x="imdb_score", y="num_critic_for_reviews", data=dff, kind="hex", color="#8855AA")
sns.jointplot(x="imdb_score", y="num_user_for_reviews", data=dff, kind="hex", color="#8855AA")
cw_lm = ols('imdb_score ~ num_critic_for_reviews + duration + director_facebook_likes + actor_3_facebook_likes + actor_1_facebook_likes + gross + num_voted_users + cast_total_facebook_likes + facenumber_in_poster + num_user_for_reviews + budget + actor_2_facebook_likes + aspect_ratio + movie_facebook_likes + profit + return_on_investment_perc',
            data=dff).fit()  # wrap categorical predictors in C() if needed
print(sm.stats.anova_lm(cw_lm, typ=2))
# Anova test
cw_lm2=ols('imdb_score ~ num_critic_for_reviews + duration + num_voted_users + num_user_for_reviews + movie_facebook_likes',
data=dff).fit() #Specify C for Categorical
print(sm.stats.anova_lm(cw_lm2, typ=2))
runs_reg_model1 = ols("imdb_score ~ num_critic_for_reviews + duration + num_voted_users + num_user_for_reviews + movie_facebook_likes",dff)
runs_reg1 = runs_reg_model1.fit()
print(runs_reg1.summary())
runs_reg1.mse_resid
runs_reg_model = ols("imdb_score ~ profit",dff)
runs_reg2 = runs_reg_model.fit()
print(runs_reg1.summary())
runs_reg2.mse_resid
runs_reg_model = ols("imdb_score ~ num_critic_for_reviews",dff)
runs_reg3 = runs_reg_model.fit()
print(runs_reg1.summary())
runs_reg3.mse_resid
runs_reg_model = ols("imdb_score ~ duration",dff)
runs_reg4 = runs_reg_model.fit()
print(runs_reg1.summary())
runs_reg4.mse_resid
runs_reg_model = ols("imdb_score ~ num_voted_users",dff)
runs_reg5 = runs_reg_model.fit()
print(runs_reg1.summary())
runs_reg5.mse_resid
runs_reg_model = ols("imdb_score ~ num_user_for_reviews",dff)
runs_reg6 = runs_reg_model.fit()
print(runs_reg1.summary())
runs_reg6.mse_resid
runs_reg_model = ols("imdb_score ~ movie_facebook_likes",dff)
runs_reg7 = runs_reg_model.fit()
print(runs_reg1.summary())
runs_reg7.mse_resid
df5 = dff[['imdb_score', 'num_critic_for_reviews','duration','num_voted_users','num_user_for_reviews', 'movie_facebook_likes']];
y = df5['imdb_score']
X = df5.drop(['imdb_score'], axis =1)
model1 = linear_model.Lasso(alpha=1)
model1.fit(X, y)
model1_y = model1.predict(X)
print('Coefficients: ', model1.coef_)
print("y-intercept ", model1.intercept_)
coef = ["%.3f" % i for i in model1.coef_]
xcolumns = [ i for i in X.columns ]
list(zip(xcolumns, coef))
print("mean square error: ", mean_squared_error(y, model1_y))
print("variance or r-squared: ", explained_variance_score(y, model1_y))
X_new = SelectKBest(f_regression, k=2).fit_transform(X, y)
X_new
# this helps us find out which variables are selected
selector = SelectKBest(f_regression, k=2).fit(X, y)
idxs_selected = selector.get_support(indices=True)
print(idxs_selected)
model2 = lm.LinearRegression()
model2.fit(X_new, y)
model2_y = model2.predict(X_new)
print("mean square error: ", mean_squared_error(y, model2_y))
print("variance or r-squared: ", explained_variance_score(y, model2_y))
selector = SelectKBest(f_regression, k=3).fit(X, y)
idxs_selected = selector.get_support(indices=True)
print(idxs_selected)
X_new3 = selector.transform(X)  # re-transform with k=3 before refitting
model3 = lm.LinearRegression()
model3.fit(X_new3, y)
model3_y = model3.predict(X_new3)
print("mean square error: ", mean_squared_error(y, model3_y))
print("variance or r-squared: ", explained_variance_score(y, model3_y))
from scipy.cluster.hierarchy import dendrogram, linkage, ward
np.random.seed(1) # setting random seed to get the same results each time.
agg= AgglomerativeClustering(n_clusters=4, linkage='ward').fit(X)
agg.labels_
plt.figure(figsize=(16,8))
linkage_matrix = ward(X)
dendrogram(linkage_matrix, orientation="top")
plt.tight_layout() # fixes margins
plt.figure(figsize=(16,8))
plt.title('Hierarchical Clustering Dendrogram (truncated)')
plt.xlabel('sample index or (cluster size)')
plt.ylabel('distance')
linkage_matrix = ward(X)
dendrogram(linkage_matrix,
           truncate_mode='lastp',  # show only the last p merged clusters
           p=4,
           # show_leaf_counts=False,  # otherwise numbers in brackets are counts
           leaf_rotation=90.,
           leaf_font_size=12.,
           show_contracted=True,  # to get a distribution impression in truncated branches
           orientation="top")
plt.tight_layout() # fixes margins
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(X)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)
pca = PCA(n_components=2, svd_solver='full')
pca.fit(X)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)
pca = PCA(n_components=1, svd_solver='arpack')
pca.fit(X)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)
from sklearn.decomposition import IncrementalPCA
ipca = IncrementalPCA(n_components=2, batch_size=3)
ipca.fit(X)
ipca.transform(X)
df6 = df1.dropna()  # df1 carries the imdbscores_bins column
df6 = df6[['imdbscores_bins', 'num_critic_for_reviews', 'duration', 'num_voted_users', 'num_user_for_reviews', 'movie_facebook_likes']]
y = df6['imdbscores_bins']
X = df6.drop(['imdbscores_bins'], axis =1)
df6 = df.dropna()
df6 = df6[['imdb_score', 'num_critic_for_reviews','duration','num_voted_users','num_user_for_reviews', 'movie_facebook_likes']];
# setting my own values for bins
df6['imdbscores_bins'] = pd.cut(df6['imdb_score'], bins=[0, 4, 6, 8, 10], labels=[1, 2, 3, 4],
include_lowest=True)
# see the result
df6.info()
df6 = df6.drop(['imdb_score'], axis =1)
df6['imdbscores_bins'] = df6.imdbscores_bins.astype(int)
df6.info()
y = df6['imdbscores_bins']
X = df6.drop(['imdbscores_bins'], axis =1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train);
print(metrics.accuracy_score(y_test, dt.predict(X_test)))
print("--------------------------------------------------------")
print(metrics.confusion_matrix(y_test, dt.predict(X_test)))
print("--------------------------------------------------------")
print(metrics.classification_report(y_test, dt.predict(X_test)))
print("--------------------------------------------------------")
#print(metrics.roc_auc_score(y_test, dt.predict(X_test)));
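The commented-out roc_auc_score call errors on this four-class target; a minimal multiclass sketch (my addition, assuming every class appears in the test split) uses one-vs-rest averaging over predicted probabilities:
# multiclass AUC: one-vs-rest macro average over class-probability estimates
y_proba = dt.predict_proba(X_test)
print(metrics.roc_auc_score(y_test, y_proba, multi_class='ovr'))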
import scikitplot as skplt
skplt.metrics.plot_confusion_matrix(y_true=np.array(y_test), y_pred=dt.predict(X_test))
plt.show()
from io import StringIO  # sklearn.externals.six has been removed from scikit-learn
import pydotplus
dot_data = StringIO()
tree.export_graphviz(dt, out_file=dot_data, feature_names=X.columns,
filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
from graphviz import Source
from sklearn import tree
Source( tree.export_graphviz(dt, out_file=None, feature_names=X.columns))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print(metrics.accuracy_score(y_test, knn.predict(X_test)))
print("--------------------------------------------------------")
print(metrics.confusion_matrix(y_test, knn.predict(X_test)))
print("--------------------------------------------------------")
print(metrics.classification_report(y_test, knn.predict(X_test)))
print("--------------------------------------------------------")
#print(metrics.roc_auc_score(y_test, knn.predict(X_test)))
k_range = range(1, 10)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores.append(np.mean(cross_val_score(knn, X, y, cv=10, scoring='accuracy')))
#plt.figure()
plt.plot(k_range, scores)
plt.xlabel('k value')
plt.ylabel('accuracy')
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
gb = GradientBoostingClassifier(n_estimators=100, random_state=0)
gb.fit(X_train, y_train)
print(metrics.accuracy_score(y_test, gb.predict(X_test)))
print("--------------------------------------------------------")
print(metrics.confusion_matrix(y_test, gb.predict(X_test)))
print("--------------------------------------------------------")
print(metrics.classification_report(y_test, gb.predict(X_test)))
print("--------------------------------------------------------")
# 10-fold cross-validation
scores = cross_val_score(gb, X, y, scoring='accuracy', cv=10)
print(scores)
print(scores.mean())
svm = SVC(gamma='scale', probability=True)
svm.fit(X_train, y_train)
print(metrics.accuracy_score(y_test, svm.predict(X_test)))
print("--------------------------------------------------------")
print(metrics.confusion_matrix(y_test, svm.predict(X_test)))
print("--------------------------------------------------------")
print(metrics.classification_report(y_test, svm.predict(X_test)))
print("--------------------------------------------------------")
# 10-fold cross-validation
svm = SVC(gamma='auto')
scores = cross_val_score(svm, X, y, scoring='accuracy', cv=10)
print(scores)
print(scores.mean())
nn = MLPClassifier(solver='lbfgs', max_iter=500)
nn.fit(X_train, y_train)
print(metrics.accuracy_score(y_test, nn.predict(X_test)))
print("--------------------------------------------------------")
print(metrics.confusion_matrix(y_test, nn.predict(X_test)))
print("--------------------------------------------------------")
print(metrics.classification_report(y_test, nn.predict(X_test)))
print("--------------------------------------------------------")
dfff = df.dropna()
dfff = dfff[['gross', 'duration', 'aspect_ratio', 'num_critic_for_reviews', 'director_facebook_likes', 'actor_3_facebook_likes', 'actor_1_facebook_likes', 'cast_total_facebook_likes', 'num_user_for_reviews', 'budget', 'actor_2_facebook_likes', 'imdb_score']];
dffff = dfff[['gross', 'budget','num_user_for_reviews','imdb_score']];
k_means = KMeans(init='k-means++', n_clusters=4, random_state=0)
k_means.fit(dfff)
from yellowbrick.cluster import SilhouetteVisualizer
visualizer = SilhouetteVisualizer(k_means, colors='yellowbrick')
visualizer.fit(dfff) # Fit the data to the visualizer
visualizer.show()
from scipy.spatial.distance import cdist
K = range(1, 10)
meandistortions = []
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=1)
    kmeans.fit(dfff)
    meandistortions.append(sum(np.min(cdist(dfff, kmeans.cluster_centers_, 'euclidean'), axis=1)) / dfff.shape[0])
plt.plot(K, meandistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')
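Reading the elbow off the plot is subjective; as a rough heuristic of my own (not part of the original analysis), one could pick the k that comes right after the largest single drop in average distortion:
# crude elbow pick (assumed heuristic): k right after the steepest drop
drops = np.diff(meandistortions)
best_k = K[int(np.argmin(drops)) + 1]
print("suggested k:", best_k)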
range_n_clusters = list(range(2, 10))
print("Number of clusters from 2 to 9:\n", range_n_clusters)
Image(url= "https://s.hdnux.com/photos/01/03/74/10/17809955/3/480x480.png")